## Directory prefix that is prepended to every input and output file name
## opened by this script.
path_openalex = '../'
## Length of the citation time window in years: a citing paper is kept only
## when its publication year is at most the focal paper's year plus this value.
post_year = 10
## Input: 
## 1. OpenAlexID_Year.txt (2 G)
## 2. OpenAlexID_ReferencedPaperIDs.txt (17 G)
## 3. OpenAlexID_CitingPaperIDs.txt (17 G)

## Output: 
## 1. OpenAlexID_Year_DindexTenYears.txt (1 G)


def Calculate_D_index(citations,reference_citations):
    """Return the disruption score (D-index) of a focal paper.

    Args:
        citations: IDs of the papers that cite the focal paper. Any iterable
            of hashable IDs is accepted; duplicates are ignored.
        reference_citations: IDs of the papers that cite at least one of the
            focal paper's references. Any iterable of hashable IDs is
            accepted; duplicates are ignored.

    Returns:
        The float (ni - nj) / (ni + nj + nk), where ni, nj and nk are the
        counts of type i, j and k citations described below. Only the score
        is returned, not the individual counts.

    Raises:
        ZeroDivisionError: if both inputs are empty, because the score is
            undefined without any citing paper.
    """
    ### Sets are used as-is (no copy); other iterables are converted so that
    ### the intersection below works and duplicates are counted once.
    if not isinstance(citations, (set, frozenset)):
        citations = set(citations)
    if not isinstance(reference_citations, (set, frozenset)):
        reference_citations = set(reference_citations)

    ### type j citations -- papers that cite both the focal paper and its references.
    nj = len(reference_citations & citations)
    ### type i citations -- papers that cite the focal paper but none of its references.
    ni = len(citations) - nj
    ### type k citations -- papers that cite the references but not the focal paper itself.
    nk = len(reference_citations) - nj

    total = ni + nj + nk
    if total == 0:
        raise ZeroDivisionError('D-index is undefined: no citing papers were given')
    return float(ni-nj)/total


## Section 1: Data preparation
### Build the year lookup table.
### Format: pid_y -> {paperid: publication year}
pid_y = {}
n = 0  ### number of lines read so far; drives the progress output
with open(path_openalex+'OpenAlexID_Year.txt','r') as f:
    for n, row in enumerate(f, 1):
        ### Report progress once per million lines read.
        if not n%1000000:
            print(n/1000000)
        fields = row.strip('\n').split('\t')
        ### First column is the paper id, second column its publication year.
        pid_y[fields[0]] = int(fields[1])
print(1,len(pid_y))

### Build the citation lookup table.
### Format: cit_dict -> {paperid: [citing paper1 id, citing paper2 id, ...]}
cit_dict = {}
with open(path_openalex+'OpenAlexID_CitingPaperIDs.txt','r') as f:
    for row in f:
        ### First column is the cited paper; every further column is one citing paper.
        cited_id, *citing_ids = row.strip('\n').split('\t')
        cit_dict[cited_id] = citing_ids
print(2,len(cit_dict))


## Section 2: D-index calculation process: 
### a. Read references of a focal paper one at a time.
### b. Extract future citations of the focal paper and future citations of its references from cit_dict.
### c. Compute the disruption score (D-index) over the ten-year time window and store it in the output file.
print('3')
count = 0  ## focal papers whose publication year lies in the accepted range
valid_paper = 0  ## focal papers whose D-index was written to the output file
count1 = 0  ## focal papers without any citation inside the time window
missing_year = 0  ## focal papers skipped because pid_y holds no year for them


def _papers_within_window(paper_ids, last_year):
    ### Return the set of paper_ids whose known publication year is <= last_year.
    ### IDs that are missing from pid_y are left out: without a year they
    ### cannot be placed inside the window (previously this raised KeyError).
    kept = set()
    for p in paper_ids:
        year = pid_y.get(p)
        if year is not None and year <= last_year:
            kept.add(p)
    return kept


## NOTE(review): the window only has an upper bound (focal year + post_year),
## so citing papers dated before the focal paper are kept, and the focal paper
## itself presumably appears among the citers of its own references (it would
## then be counted as a type k citation). Confirm that both are intended.

## Single '.txt' extension so that the file name matches the output file
## announced in the header comment of this script.
with open(path_openalex+'OpenAlexID_Year_DindexTenYears.txt', 'w') as fw:
    with open(path_openalex+'OpenAlexID_ReferencedPaperIDs.txt','r') as f:
        for line in f:
            ## First column: focal paper id; remaining columns: its references.
            line = line.strip('\n').split('\t')
            paperid = line[0]
            foc_y = pid_y.get(paperid)
            if foc_y is None:
                ## Unknown publication year (or blank line): skip instead of
                ## aborting the whole run with a KeyError.
                missing_year += 1
                continue
            if foc_y<1800 or foc_y>2014:
                continue
            count += 1
            last_year = foc_y+post_year

            ## citation_: papers that cite the focal paper within the window.
            ## Checked before the references are processed so that uncited
            ## papers are skipped without gathering their references' citers.
            citation_ = _papers_within_window(cit_dict.get(paperid, ()), last_year)
            if not citation_:
                count1 += 1
                continue

            ## reference_citation: papers that cite the references of the
            ## focal paper, collected directly into a set to drop duplicates.
            reference_citation = set()
            for item in line[1:]:
                reference_citation.update(cit_dict.get(item, ()))
            reference_citation_ = _papers_within_window(reference_citation, last_year)

            valid_paper += 1
            resulttemp = [foc_y, Calculate_D_index(citation_,reference_citation_)]
            ## Output row: paper id, publication year, D-index (tab separated).
            fw.write(str(paperid)+'\t'+'\t'.join(map(str,resulttemp))+'\n')

            ## Progress output; only reached for papers that produced a score.
            if count%1000==1:
                print(int(count/1000), resulttemp)

print(count, 'total papers.')
print(valid_paper, 'valid papers.')
print(count1, 'no citation papers.')
print(missing_year, 'papers skipped (unknown publication year).')

print('done.')